; Fur + Real Phong shading implemented in flat assembler
; by Maciej Guba.
; http://macgub.co.pl

ROUND2 equ 10                        ; not referenced in this part of the file
; Fur texture geometry. FUR_SIZEXY and FUR_SIZE are derived from FUR_SHIFT
; so that the row shift used when addressing a fur texel and the stride
; between consecutive shells cannot get out of sync.
FUR_SHIFT  = 9                       ; log2 of the fur texture edge length
FUR_SIZEXY = 1 shl FUR_SHIFT         ; edge length of one fur shell (512)
FUR_SIZE   = FUR_SIZEXY*FUR_SIZEXY   ; bytes in one fur shell (one byte per strand)
FUR_SHELLS = 8                       ; upper bound of the shell loop counter
; include "labs.inc" ;  dbg / profiling purpose
;fur_phong_tri_z:
;----Procedure renders a Phong shaded triangle with an ---
;----arbitrary given color and a fur effect.--------------
; The initial block of instructions (sorting, deltas and -
; the loop calling the horizontal line proc) - see file --
; 3glass_tex.inc  ---------------------------------------

fur_real_phong_line_z:
; Draws one horizontal span of a Phong shaded, z-buffered triangle and,
; for every pixel that passes the z test, plots the fur shells displaced
; along the pixel's normalised normal.
;
; in:
;    xmm0 - normal vector 1
;    xmm1 - normal vect 2
;    xmm3 - lo -> hi fur x1, fur y1, z1 coords as dwords float
;    xmm5 - lo -> hi fur x2, fur y2, z2 coords as dwords float
;    xmm2 - lo -> hi y_min, y_max, x_min, x_max
;           as dword integers
;    xmm6 - low dword is stored to .xres and used as the integer
;           row pitch (in pixels) of the screen and z buffers
;
;    eax - x1
;    ebx - x2
;    ecx - y
;    edx - -----
;    edi - screen buffer
;    esi - z buffer filled with dword floats
;
; Nothing is drawn when the span is empty (x2 <= x1) or ends up empty
; after clipping to x_min .. x_max.
; General purpose and xmm registers are not preserved (ebp is).
; External symbols used: furs_ptr, margin3fur, lights_aligned,
; lights_aligned_end, TEXTURE_SIZE.

   push  ebp
   mov   ebp,esp
   sub   esp,208               ; multiple of 4 keeps the pushes below dword aligned
   and   ebp,0xfffffff0        ; ebp becomes a 16 byte aligned base for movaps
   sub   ebp,64                ; locals occupy [ebp-112] .. [ebp+63]

   .n1      equ [ebp-16]
   .n2      equ [ebp-32]
   .yf      equ [ebp-36]
   .xf      equ [ebp-40]
   .dfy     equ [ebp-44]
   .dfx     equ [ebp-48] ; delta x, y coords of  fur shells
   .xres    equ dword[ebp-52]
   .lx2     equ [ebp-56]
   .z2      equ [ebp-60]
   .z1      equ [ebp-64]
   .screen  equ [ebp-68]
   .zbuff   equ [ebp-72]
   .x_max   equ [ebp-74]
   .x_min   equ [ebp-76]
   .y_max   equ [ebp-78]
   .y_min   equ [ebp-80]
   .dn      equ [ebp-96]
   .dz      equ [ebp-100]
   .fshl_ptr equ dword[ebp-104]
   .y        equ [ebp-108]
   .lx1      equ [ebp-112]


   .cnv      equ [ebp-16]      ; current normalised normal; shares the .n1
                               ; slot, which is only read once before the loop

   .msk_255f equ [ebp+48]
   .zer_h    equ [ebp+32]
   .y_mn     equ [ebp+28]
   .x_mn     equ [ebp+24]
   .y_mx     equ [ebp+20]
   .x_mx     equ [ebp+16]
   .fy1      equ [ebp+12]
   .fx1      equ [ebp+8]     ;fur parameters
   .fy2      equ [ebp+4]
   .fx2      equ [ebp]

        mov       .y,ecx
        movaps    xmm4,xmm2
        packssdw  xmm2,xmm2               ; clip rectangle as four signed words
        movlps    .y_min,xmm2             ; fills .y_min, .y_max, .x_min, .x_max
        shufps    xmm4,xmm4,00100111b     ; -> x_max, y_max, x_min, y_min
        mov       edx,[furs_ptr]
        paddd     xmm4,[margin3fur]       ; limits used for the fur pixels

        movss     .xres,xmm6
        movaps    .x_mx,xmm4              ; fills .x_mx, .y_mx, .x_mn, .y_mn
        mov       .fshl_ptr,edx

        ; NOTE(review): xmm3 (documented as point 1) lands in the .fx2 slot
        ; and xmm5 (point 2) in .fx1, so the fur coordinate starts at the
        ; value passed in xmm5 and moves toward the one in xmm3 - confirm
        ; against the caller that this is intended.
        movlps    .fx2,xmm3
        movlps    .fx1,xmm5

        pcmpeqd   xmm2,xmm2        ; generate some constants
        psrldq    xmm2,4           ; ones in lanes 0-2, zero in lane 3
        movhps    .z1,xmm3         ; .z1 <- z1 (the .z2 slot is rewritten next)
        movhps    .z2,xmm5         ; .z2 <- z2 (the .lx2 slot is rewritten below)

        andps     xmm0,xmm2  ;[zero_hgst_dd]
        andps     xmm1,xmm2  ;[zero_hgst_dd]
        movaps    .n1,xmm0
        movaps    .n2,xmm1
        psrld     xmm2,24
        cvtdq2ps  xmm2,xmm2        ; 255.0 in lanes 0-2, 0.0 in lane 3
        movaps    .msk_255f,xmm2

        mov       .lx1,eax
        mov       edx,ebx
        sub       edx,eax                 ; edx = full span length x2 - x1
        jle       .end_rp_line            ; empty or reversed span - nothing to draw

        movzx     ecx,word .x_max         ; clamp only the drawn end point ...
        cmp       ebx,ecx
        cmovg     ebx,ecx
        mov       .lx2,ebx

        cvtsi2ss  xmm7,edx                ; ... the per pixel deltas are taken
        rcpss     xmm7,xmm7               ; over the full, unclamped span
        shufps    xmm7,xmm7,0
        subps     xmm1,xmm0
        mulps     xmm1,xmm7
        movaps    .dn,xmm1                ; normal step per pixel
        movhlps   xmm5,xmm5               ; low lane <- z2
        subss     xmm5,.z1
        mulss     xmm5,xmm7
        movss     .dz,xmm5                ; z step per pixel

        movlps    xmm6,.fx2     ; calc fur deltas
        movlps    xmm5,.fx1

        subps     xmm6,xmm5
        mulps     xmm6,xmm7
        movlps    .dfx,xmm6               ; fills .dfx and .dfy

        mov       ebx,.lx1
        cmp       bx,.x_min     ; clipping on function4
        jge       @f
        movzx     eax,word .x_min
        sub       eax,ebx                 ; number of pixels cut off on the left
        cvtsi2ss  xmm7,eax
        shufps    xmm7,xmm7,0
        movss     xmm3,.dz                ; advance z by dz * skipped pixels
        mulss     xmm3,xmm7
        mulps     xmm1,xmm7
        mulps     xmm6,xmm7
        movlps    xmm5,.fx1
        addss     xmm3,.z1
        addps     xmm1,.n1
        addps     xmm6,xmm5

        movsx     eax,word .x_min
        movss     .z1,xmm3
        movaps    .n1,xmm1
        movlps    .fx1,xmm6
        mov       dword .lx1,eax

      @@:

        mov       eax,.xres
        mul       dword .y
        add       eax,.lx1
        mov       .zbuff,esi
        mov       .screen,edi
        shl       eax,2                   ; byte offset of the first pixel
        add       edi,eax
        add       esi,eax
        mov       ecx,.lx2
        sub       ecx,.lx1                ; ecx = number of pixels to draw
        jle       .end_rp_line            ; clipping left nothing to draw

        movaps    xmm0,.n1                ; xmm0 = running normal
        movaps    xmm2,.z1                ; low lane of xmm2 = running z
   .ddraw:
        movss     xmm7,xmm2
        cmpnltss  xmm7,dword[esi]         ; mask set when not (z < stored z)
        movd      eax,xmm7
        or        eax,eax
        jnz       .skip
        movss     [esi],xmm2
        movaps    xmm7,xmm0
        dpps      xmm7,xmm7,0xff
        rsqrtps   xmm7,xmm7
        mulps     xmm7,xmm0
        movaps    .cnv,xmm7               ; normalised normal of this pixel

        ; Walk the 64 byte light records and keep the per channel maximum
        ; of  d*[edx+16] + d^8*[edx+48]  limited to 255, where d is the
        ; 3 component dot product of [edx] with the normal.
        mov       edx,lights_aligned
        xorps     xmm1,xmm1
      @@:
        movaps    xmm5,[edx]
        dpps      xmm5,.cnv,01110111b
     ;  mulps     xmm5,.cnv  ;.lv  ; last dword should be zeroed
        movaps    xmm7,xmm5
        mulps     xmm7,xmm7
        mulps     xmm7,xmm7
        mulps     xmm5,[edx+16]
        mulps     xmm7,xmm7
        mulps     xmm7,[edx+48]   ;xmm3
        addps     xmm5,xmm7

        minps     xmm5,.msk_255f
        maxps     xmm1,xmm5
        add       edx,64
        cmp       edx,lights_aligned_end    ; global
        jnz       @b
        cvtps2dq  xmm1,xmm1
        packssdw  xmm1,xmm1
        xorps     xmm3,xmm3
        packuswb  xmm1,xmm1
        movss     [edi],xmm1              ; write the shaded pixel
        punpcklbw xmm1,xmm3

        ; fur work
        push      ecx
        push      esi
        movlps    xmm4,.fx1
        maxps     xmm4,xmm3               ; xmm3 is zero: no negative fur coords
        sub       esp,8
        cvttps2dq xmm4,xmm4
        ; movd      esi,xmm4
        movaps    xmm3,xmm4
        ; psrldq    xmm4,4
        ; movd      eax,xmm4
        movlps    [esp],xmm4
        pop       esi eax                 ; esi = fur x, eax = fur y (integers)
        shl       eax,FUR_SHIFT
        add       esi,eax
        ; TEXTURE_SIZE is defined elsewhere and is used here as a bit mask
        ; - presumably FUR_SIZE-1, TODO confirm.
        and       esi,TEXTURE_SIZE
        add       esi,.fshl_ptr
        mov       ecx,1
      .fr:
        cmp       byte[esi],0             ; no strand in this shell at this texel
        jz        @f
        mov       eax,ecx
        shl       eax,2                   ; displacement length = 4 * shell counter
        cvtsi2ss  xmm3,eax
        shufps    xmm3,xmm3,11000000b
        mulps     xmm3,.cnv               ; displacement along the normal
        movhlps   xmm4,xmm3               ; low lane of xmm4 = z displacement
        cvtps2dq  xmm3,xmm3
        paddd     xmm3,.lx1               ; lanes 0,1 += current x (.lx1) and .y
        movlps    .xf,xmm3                ; fills .xf and .yf
        movlhps   xmm3,xmm3
        pcmpgtd   xmm3,.x_mx              ; x>x_mx, y>y_mx, x>x_mn, y>y_mn
        movhlps   xmm6,xmm3
        xorps     xmm6,xmm3               ; low 8 bytes all ones when inside
        pmovmskb  eax,xmm6
        and       eax,0xff
        cmp       eax,0xff
        jne       @f

        mov       ebx,.yf
        mov       edx,.xres
        imul      ebx,edx
        add       ebx,.xf
        shl       ebx,2
        mov       eax,ebx
        addss     xmm4,xmm2    ; xmm2 - cur z
        movaps    xmm3,xmm4
        add       ebx,.zbuff
        cmpltss   xmm4,dword[ebx]
        movd      edx,xmm4
        or        edx,edx
        jz        @f                      ; fur pixel is hidden
        add       eax,.screen
        movss     [ebx],xmm3
        movzx     ebx,byte[esi]
        shl       ebx,5
        mov       [eax],ebx               ; fur pixel value = strand byte * 32
      @@:
        add       esi,FUR_SIZE            ; same texel in the next shell
        inc       ecx
        cmp       ecx,FUR_SHELLS
        jnz       .fr

        pop       esi
        pop       ecx

     .skip:
        movlps    xmm6,.fx1
        add       edi,4
        add       esi,4
        inc       dword .lx1              ; .lx1 tracks the current x
        addps     xmm6,.dfx
        addps     xmm0,.dn
        addss     xmm2,.dz
        movlps    .fx1,xmm6
        dec       ecx
        jnz       .ddraw

  .end_rp_line:
        add       esp,208
        pop       ebp

ret



fur_shell_generator:
  ; Fills esi consecutive shells of ecx bytes each, starting at edi,
  ; with the low byte of eax as left by 'random' (defined elsewhere).
  ; 'random' is called with ecx = 0 and edx = 5 - (shells still to do),
  ; and [rand_seed] is reset to 10001 before every shell.
  ; in ecx - fur size
  ;    edi - fur ptr
  ;    esi - shels count
  ; each hair 'strand' of fur as byte
  ; out: edi points past the last byte written, esi = 0, ecx and edx
  ;      unchanged. Returns at once, writing nothing, when ecx = 0
  ;      or esi <= 0.
  push  ebp
  mov   ebp,esp
  test  ecx,ecx
  jz    .done                ; a zero size would wrap the strand counter
  test  esi,esi
  jle   .done                ; a zero or negative count would wrap the shell counter
.lp:
  mov   [rand_seed],10001
  push  ecx                  ; keep the shell size for the next shell
  cld                        ; stosb must move edi upward
.strand:
  push  edx                  ; 'random' is not assumed to preserve registers
  push  ecx
  push  edi
  push  esi
  xor   ecx,ecx
  mov   edx,5
  sub   edx,esi
  call  random
  pop   esi
  pop   edi
  pop   ecx
  pop   edx
  stosb
  dec   ecx
  jnz   .strand
  pop   ecx
  dec   esi
  jnz   .lp

.done:
  mov   esp,ebp
  pop   ebp
ret
